{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/CoreTheGreat/HBPU-Machine-Learning-Course/blob/main/ML_Chapter4_Clustering.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lPboLx_o0UxI"
   },
   "source": [
    "# 第四章：聚类\n",
    "湖北理工学院《机器学习》课程资料\n",
    "\n",
    "作者：李辉楚吴\n",
    "\n",
    "笔记内容概述: K均值聚类 k-Means\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "c-Xj4WtjPdwD"
   },
   "source": [
    "### 一维k-Means聚类的应用: 二值分割（Binary Segmentation）\n",
    "\n",
    "Step 1: 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 0
    },
    "id": "JWUUOgNeEyZ6",
    "outputId": "6d084a58-c228-4ed7-ec76-5d5a03f34679"
   },
   "outputs": [],
   "source": [
    "import cv2\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "label_size = 18 # Label size\n",
    "ticklabel_size = 14 # Tick label size\n",
    "\n",
    "# Load the car number (license plate) image; OpenCV decodes into BGR channel order\n",
    "carno_path = './Data/car_num.jpg'\n",
    "image_carno = cv2.imread(carno_path)\n",
    "\n",
    "# cv2.imread does not raise on a missing or unreadable file, it returns None,\n",
    "# which would otherwise only surface as a cryptic error inside cvtColor\n",
    "if image_carno is None:\n",
    "    raise FileNotFoundError(f'Cannot read image file: {carno_path}')\n",
    "\n",
    "# Convert from BGR to RGB so that matplotlib displays the true colors\n",
    "image_carno = cv2.cvtColor(image_carno, cv2.COLOR_BGR2RGB)\n",
    "\n",
    "# Display the image\n",
    "fig, ax = plt.subplots(figsize=(7,7))\n",
    "img = ax.imshow(image_carno)\n",
    "plt.axis('off')  # Hide axes\n",
    "plt.savefig('carno_base.png', dpi=300) # Make figure clearer\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Step 2: 灰度化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 0
    },
    "id": "t8zeSOFyPs29",
    "outputId": "18c5d388-99d6-45a3-9715-9db7cf285116"
   },
   "outputs": [],
   "source": [
    "# Convert image_carno into a grey image.\n",
    "# image_carno was converted from BGR to RGB right after loading, so the RGB->grey\n",
    "# code is the correct one here; the BGR code would swap the red and blue weights.\n",
    "image_carno_grey = cv2.cvtColor(image_carno, cv2.COLOR_RGB2GRAY)\n",
    "\n",
    "# Display the image\n",
    "fig, ax = plt.subplots(figsize=(7,7))\n",
    "img = ax.imshow(image_carno_grey, cmap='gray')\n",
    "plt.axis('off')  # Hide axes\n",
    "plt.savefig('carno_grey.png', dpi=300) # Make figure clearer\n",
    "plt.show()\n",
    "\n",
    "# Create a histogram of grey values (one bin per grey level 0..255)\n",
    "plt.figure(figsize=(8, 4))\n",
    "plt.hist(image_carno_grey.ravel(), bins=256, range=(0, 256))\n",
    "plt.ylim(0, 5000)\n",
    "plt.title('Histogram of Grey Values', fontsize=label_size)\n",
    "plt.xlabel('Pixel Value', fontsize=label_size)\n",
    "plt.ylabel('Frequency', fontsize=label_size)\n",
    "plt.xticks(fontsize=ticklabel_size)\n",
    "plt.yticks(fontsize=ticklabel_size)\n",
    "plt.savefig('carno_grey_histogram.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Step 3: 根据灰度分为两类，生成掩码\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 0
    },
    "id": "MzSX7UoKQwyq",
    "outputId": "a5cf9244-4df4-4600-b435-1eda9b4adeba"
   },
   "outputs": [],
   "source": [
    "import time\n",
    "import numpy as np\n",
    "\n",
    "# Flatten image_carno_grey into a 1-D float vector (one grey value per pixel)\n",
    "x = image_carno_grey.reshape(-1).astype(np.float64)\n",
    "\n",
    "# Using k-Means to separate background and foreground by pixels\n",
    "k = 2\n",
    "\n",
    "# Initialize cluster centers uniformly in [0, 255). The seeded generator makes the\n",
    "# run reproducible; change the seed to observe other initializations.\n",
    "rng = np.random.default_rng(42)\n",
    "cluster_centers = rng.random(k) * 255\n",
    "start_centers = cluster_centers.copy()\n",
    "\n",
    "# Iteration\n",
    "start_time = time.time()\n",
    "max_iter = 1000\n",
    "for iter_id in range(max_iter):\n",
    "    # Distance from every pixel to every center, shape (len(x), k)\n",
    "    distance = np.abs(x[:, np.newaxis] - cluster_centers[np.newaxis, :])\n",
    "\n",
    "    # Assign to closest centroid\n",
    "    cluster_idx = np.argmin(distance, axis=1)\n",
    "\n",
    "    # Update cluster centers\n",
    "    cluster_centers_prior = cluster_centers.copy()\n",
    "    for i in range(k):\n",
    "        members = x[cluster_idx == i]\n",
    "        if members.size > 0:\n",
    "            cluster_centers[i] = members.mean()\n",
    "        else:\n",
    "            # An empty cluster has no mean: np.mean would return NaN and poison\n",
    "            # every later distance. Move its center to the pixel that is farthest\n",
    "            # from its currently assigned center instead.\n",
    "            cluster_centers[i] = x[np.argmax(np.min(distance, axis=1))]\n",
    "\n",
    "    # Check if cluster_centers are stable enough to stop training\n",
    "    print(f'Iteration {iter_id}: Updated centers {cluster_centers}, Prior centers {cluster_centers_prior}')\n",
    "    if np.array_equal(cluster_centers, cluster_centers_prior):\n",
    "        break\n",
    "\n",
    "end_time = time.time()\n",
    "print(f'Stop after iteration {iter_id}, time consumption is {end_time-start_time}')\n",
    "\n",
    "# Generate segmentation mask\n",
    "carno_mask_pixel = np.zeros_like(cluster_idx)\n",
    "low_value_cluster = np.argmin(cluster_centers)\n",
    "carno_mask_pixel[cluster_idx != low_value_cluster] = 1 # Set pixels with higher grey value to 1\n",
    "carno_mask_pixel = carno_mask_pixel.reshape(image_carno_grey.shape)\n",
    "\n",
    "# Display the mask\n",
    "fig, ax = plt.subplots(figsize=(7,7))\n",
    "img = ax.imshow(carno_mask_pixel, cmap='gray')\n",
    "ax.set_title(f'Final Centers: {cluster_centers}, Iteration: {iter_id}', fontsize=label_size)\n",
    "plt.axis('off')  # Hide axes\n",
    "plt.savefig('carno_mask_pixel_2.png', dpi=300) # Make figure clearer\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "DJahqo4HcdkW"
   },
   "source": [
    "### 多维k-Means聚类应用：图像压缩 (Image Compression)\n",
    "\n",
    "Step 1: 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 653
    },
    "id": "F0zcJGAle3fC",
    "outputId": "e27bc9f0-5afb-4039-fb64-8d57ab396f0c"
   },
   "outputs": [],
   "source": [
    "# Load the BMW image used for the compression demo (OpenCV decodes into BGR order)\n",
    "bmwk_path = './Data/bmwk.png'\n",
    "image_bmwk = cv2.imread(bmwk_path)\n",
    "\n",
    "# cv2.imread does not raise on a missing or unreadable file, it returns None\n",
    "if image_bmwk is None:\n",
    "    raise FileNotFoundError(f'Cannot read image file: {bmwk_path}')\n",
    "\n",
    "# Convert from BGR to RGB\n",
    "image_bmwk_rgb = cv2.cvtColor(image_bmwk, cv2.COLOR_BGR2RGB)\n",
    "\n",
    "x_r = image_bmwk_rgb[:, :, 0].reshape(-1) # Store colors in red channel\n",
    "x_g = image_bmwk_rgb[:, :, 1].reshape(-1) # Store colors in green channel\n",
    "x_b = image_bmwk_rgb[:, :, 2].reshape(-1) # Store colors in blue channel\n",
    "\n",
    "# Display the image with no margin\n",
    "plt.figure(figsize=(image_bmwk_rgb.shape[1]/100, image_bmwk_rgb.shape[0]/100))  # Convert pixels to inches\n",
    "plt.imshow(image_bmwk_rgb)\n",
    "plt.axis('off')  # Hide axes\n",
    "plt.subplots_adjust(left=0, right=1, top=1, bottom=0)  # Remove margins\n",
    "plt.savefig('bmwk.png', format='png', bbox_inches='tight', pad_inches=0)\n",
    "plt.show()\n",
    "\n",
    "# Compute image storing buffer\n",
    "image_buffer = x_r.size * 3  # 3 channels (R, G, B) for each pixel\n",
    "print(f\"Image buffer size: {image_buffer} bytes\")\n",
    "# Calculate image size in megabytes (MB)\n",
    "image_size_mb = image_buffer / (1024 * 1024)\n",
    "print(f\"Image size: {image_size_mb:.2f} MB\")\n",
    "\n",
    "# Calculate image dimensions\n",
    "height, width = image_bmwk_rgb.shape[:2]\n",
    "print(f\"Image dimensions: {width}x{height} pixels\")\n",
    "\n",
    "# Calculate total number of pixels\n",
    "total_pixels = height * width\n",
    "print(f\"Total number of pixels: {total_pixels}\")\n",
    "\n",
    "# Calculate bits per pixel\n",
    "bits_per_pixel = (image_buffer * 8) / total_pixels\n",
    "print(f\"Bits per pixel: {bits_per_pixel:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Step 2: 使用k-Means进行聚类，观察不同k值的聚类结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "Ya0qjoWejixW",
    "outputId": "77cf3d3d-02af-4af3-ba9d-8e5ae7d87354"
   },
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "def KMeansImage(img, k, random_state=None):\n",
    "    \"\"\"Quantize the colors of an image to k cluster-center colors with k-Means.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    img : ndarray of shape (height, width, channels)\n",
    "        Image whose pixels are clustered by color. Values are assumed to lie in\n",
    "        the 0-255 range, because the centers are divided by 255.\n",
    "    k : int\n",
    "        Number of clusters, i.e. number of colors kept.\n",
    "    random_state : int or None, default=None\n",
    "        Forwarded to KMeans; pass an int for a reproducible result.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    img_comp : ndarray of shape (height, width, channels)\n",
    "        Image in which every pixel is replaced by its cluster-center color,\n",
    "        divided by 255.\n",
    "    center_colors : ndarray of shape (k, channels)\n",
    "        Cluster-center colors divided by 255.\n",
    "    \"\"\"\n",
    "    # NumPy images are indexed (row, column, channel) = (height, width, channel)\n",
    "    height, width, channels = img.shape\n",
    "\n",
    "    # One row per pixel, one column per color channel\n",
    "    pixels = np.reshape(img, (height * width, channels))\n",
    "\n",
    "    # Train k-Means model\n",
    "    mdl_km = KMeans(n_clusters=k, n_init='auto', random_state=random_state)\n",
    "    mdl_km.fit(pixels)\n",
    "\n",
    "    # Predict the cluster label of each pixel\n",
    "    labels = mdl_km.predict(pixels)\n",
    "\n",
    "    # Get centers, scaled for display with imshow\n",
    "    center_colors = mdl_km.cluster_centers_ / 255.0\n",
    "\n",
    "    # Look up every pixel's center color in one vectorized indexing step\n",
    "    # (replaces a per-pixel Python double loop)\n",
    "    img_comp = center_colors[labels].reshape(height, width, channels)\n",
    "    return img_comp, center_colors\n",
    "\n",
    "# Numbers of colors (clusters) to try\n",
    "cluster_nums = [2, 4, 8, 16, 32, 64]\n",
    "for k in cluster_nums:\n",
    "    img_comp, center_colors = KMeansImage(image_bmwk_rgb, k)\n",
    "\n",
    "    # Display center colors as a 1 x k color strip\n",
    "    fig, ax = plt.subplots(figsize=(16,1))\n",
    "    ax.imshow([center_colors])\n",
    "    ax.axis('off')\n",
    "    # plt.savefig(f'bmwk_center_{k}.png', format='png', compress_level=9)\n",
    "    plt.show()\n",
    "\n",
    "    # Display compressed image with no margin\n",
    "    plt.figure(figsize=(image_bmwk_rgb.shape[1]/100, image_bmwk_rgb.shape[0]/100))  # Convert pixels to inches\n",
    "    plt.imshow(img_comp)\n",
    "    plt.axis('off')\n",
    "    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)  # Remove margins\n",
    "    plt.savefig(f'bmwk_comp_{k}.png', format='png', bbox_inches='tight', pad_inches=0)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "DboOsOv-qBrg"
   },
   "source": [
    "### 确定k值——手肘法\n",
    "\n",
    "Step 1: 生成数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 807
    },
    "id": "lwmavWf1xLkH",
    "outputId": "0d98d49f-750e-47b7-e868-35d7628bcaac"
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import make_blobs\n",
    "\n",
    "# Using make_blobs to generate data with six cluster centers\n",
    "X_mb, y_mb = make_blobs(n_samples=500, n_features=2, centers=6, random_state=42)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10,10))\n",
    "ax.scatter(X_mb[:, 0], X_mb[:, 1], marker=\"o\", c=y_mb, s=10**2, edgecolor=\"k\")\n",
    "plt.axis('off')\n",
    "# plt.savefig(f'make_blobs_base.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 552
    },
    "id": "ZDA08BSl08Bf",
    "outputId": "4e48a683-8890-41b5-87f7-26a87dabcf99"
   },
   "outputs": [],
   "source": [
    "# Candidate numbers of clusters and the SSE obtained for each of them\n",
    "k_list = np.arange(2, 20, 1)\n",
    "sse_list = np.zeros(len(k_list))\n",
    "\n",
    "# Fit one k-Means model per candidate k and record its SSE (inertia_)\n",
    "mdl_km_list = []\n",
    "for i, n_clusters in enumerate(k_list):\n",
    "    # Fixed random_state keeps the elbow curve and the saved figures reproducible;\n",
    "    # int() because k_list holds NumPy integers\n",
    "    mdl_km = KMeans(n_clusters=int(n_clusters), n_init='auto', random_state=42)\n",
    "    mdl_km.fit(X_mb)\n",
    "    mdl_km_list.append(mdl_km)\n",
    "    sse_list[i] = mdl_km.inertia_\n",
    "\n",
    "# Plot sse_list\n",
    "fig, ax = plt.subplots(figsize=(8,6))\n",
    "ax.plot(k_list, sse_list, marker='o', linestyle='-', color='tab:blue')\n",
    "ax.set_xticks(k_list)\n",
    "ax.set_xlabel('Number of clusters (k)', fontsize=label_size)\n",
    "ax.set_ylabel('SSE', fontsize=label_size)\n",
    "ax.tick_params(axis='both', which='major', labelsize=ticklabel_size) # Set tick label size\n",
    "# plt.savefig(f'make_blobs_sse.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Step 2: 观察不同k值的聚类结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "K-JcsrtB23wb",
    "outputId": "1d4e1981-6201-4ef3-9752-907f558bd646"
   },
   "outputs": [],
   "source": [
    "# Display clustering results for k = 2 to 7\n",
    "k_disp = [2, 3, 4, 5, 6, 7]\n",
    "\n",
    "# Look models up by their own n_clusters instead of the positional index k-2,\n",
    "# which silently selects the wrong model if the fitted k range does not start at 2\n",
    "models_by_k = {int(mdl.n_clusters): mdl for mdl in mdl_km_list}\n",
    "for k in k_disp:\n",
    "    mdl_km = models_by_k[k]\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(10,10))\n",
    "    ax.scatter(X_mb[:, 0], X_mb[:, 1], marker=\"o\", c=mdl_km.labels_, s=10**2, edgecolor=\"k\")\n",
    "    ax.set_title(f'Number of clusters (k): {k}', fontsize=label_size)\n",
    "    plt.axis('off')\n",
    "\n",
    "    plt.savefig(f'make_blobs_{k}.png', dpi=300)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pBvdXh285cam"
   },
   "source": [
    "在Moon data上进行聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 807
    },
    "id": "9qf2NtDdqFTl",
    "outputId": "d5552f56-4dc4-4d24-cd7b-0d81137e9955"
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import make_moons\n",
    "\n",
    "# Generate virtual data (Moon data)\n",
    "X_mm, y_mm = make_moons(n_samples=400, noise=0.05, random_state=42)\n",
    "\n",
    "# Normalize X_mm to the range [0, 1] using min-max scaling\n",
    "X_mm = (X_mm - np.min(X_mm, axis=0)) / (np.max(X_mm, axis=0)-np.min(X_mm, axis=0))\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10,10))\n",
    "ax.scatter(X_mm[:, 0], X_mm[:, 1], marker=\"o\", c=y_mm, s=10**2, edgecolor=\"k\")\n",
    "plt.axis('off')\n",
    "# plt.savefig(f'make_moon_base.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "3xKX1XHnuXF2"
   },
   "source": [
    "用肘部法确定k值\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 552
    },
    "id": "4INr3adN6iN5",
    "outputId": "65351e31-e708-42e7-b500-78060475ad43"
   },
   "outputs": [],
   "source": [
    "# Candidate numbers of clusters and the SSE obtained for each of them\n",
    "k_list = np.arange(2, 20, 1)\n",
    "sse_list = np.zeros(len(k_list))\n",
    "\n",
    "# Fit one k-Means model per candidate k on the moon data and record its SSE (inertia_)\n",
    "mdl_km_list = []\n",
    "for i, n_clusters in enumerate(k_list):\n",
    "    # Fixed random_state keeps the elbow curve reproducible;\n",
    "    # int() because k_list holds NumPy integers\n",
    "    mdl_km = KMeans(n_clusters=int(n_clusters), n_init='auto', random_state=42)\n",
    "    mdl_km.fit(X_mm)\n",
    "    mdl_km_list.append(mdl_km)\n",
    "    sse_list[i] = mdl_km.inertia_\n",
    "\n",
    "# Plot sse_list\n",
    "fig, ax = plt.subplots(figsize=(8,6))\n",
    "ax.plot(k_list, sse_list, marker='o', linestyle='-', color='tab:blue')\n",
    "ax.set_xticks(k_list)\n",
    "ax.set_xlabel('Number of clusters (k)', fontsize=label_size)\n",
    "ax.set_ylabel('SSE', fontsize=label_size)\n",
    "ax.tick_params(axis='both', which='major', labelsize=ticklabel_size) # Set tick label size\n",
    "# plt.savefig(f'make_moon_sse.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "观察聚类结果，思考K-means方法存在的问题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "Y7wZOZqf698Y",
    "outputId": "ce51685e-cfa6-4aa8-af13-2ae76bc561c1"
   },
   "outputs": [],
   "source": [
    "# Display clustering results for k = 2, 11 and 19\n",
    "k_disp = [2, 11, 19]\n",
    "\n",
    "# Look models up by their own n_clusters instead of the positional index k-2,\n",
    "# which silently selects the wrong model if the fitted k range does not start at 2\n",
    "models_by_k = {int(mdl.n_clusters): mdl for mdl in mdl_km_list}\n",
    "for k in k_disp:\n",
    "    mdl_km = models_by_k[k]\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(10,10))\n",
    "    ax.scatter(X_mm[:, 0], X_mm[:, 1], marker=\"o\", c=mdl_km.labels_, s=10**2, edgecolor=\"k\")\n",
    "    ax.set_title(f'Number of clusters (k): {k}', fontsize=label_size)\n",
    "    plt.axis('off')\n",
    "\n",
    "    # plt.savefig(f'make_moon_{k}.png', dpi=300)\n",
    "    plt.show()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyO+9EvQyLwzW6sRWnzqnjPu",
   "collapsed_sections": [
    "c-Xj4WtjPdwD"
   ],
   "include_colab_link": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
